# Notebook setup: interactive matplotlib backend (Jupyter magic line).
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pandas_profiling
# use the 'seaborn-colorblind' style
# NOTE(review): on matplotlib >= 3.6 this style was renamed 'seaborn-v0_8-colorblind' — confirm installed version
plt.style.use('seaborn-colorblind')
#load data
# assumes train_data.csv / test_data.csv sit next to this notebook — TODO confirm
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')
#Generates profile reports from train data
# NOTE(review): pandas_profiling was renamed ydata-profiling in newer releases
pandas_profiling.ProfileReport(train_df)
#Generates profile reports from test data
pandas_profiling.ProfileReport(test_df)
From this report we can notice:
* We don't have missing values in our dataset
<br>
* The values of energy consumed seem correct and are distributed in the range [6400, 8500] (in both the training and testing sets).
<br>
* The energy and enthalpy variables are correlated, especially in the training dataset.
The correlation between these two variables in the train set = 0.480433 and in the test set = -0.27703 => the correlation in the test set is lower
<br>
* The training set contains only one class (good energy consumption)
<br>
* The testing set contains two classes:
<br>
good energy consumption
<br>
excessive energy consumption
<br>
=> So we can address this problem as an anomaly detection problem
#get the pairwise correlation of all columns in the dataframe
# NOTE(review): on pandas >= 2.0 the non-numeric 'date' column makes corr() raise
# unless numeric_only=True is passed — confirm the installed pandas version
train_df.corr()
| energy | enthalpy | |
|---|---|---|
| energy | 1.000000 | 0.480433 |
| enthalpy | 0.480433 | 1.000000 |
test_df.corr()
| energy | enthalpy | |
|---|---|---|
| energy | 1.00000 | -0.27703 |
| enthalpy | -0.27703 | 1.00000 |
# one box per numeric column of the training set, on a fresh figure
plt.figure()
plt.title('Boxplot train set')
train_df.boxplot()
<AxesSubplot:title={'center':'Boxplot train set'}>
# one box per numeric column of the test set, on a fresh figure
plt.figure()
plt.title('Boxplot test set')
test_df.boxplot()
<AxesSubplot:title={'center':'Boxplot test set'}>
In this case, it is not possible to identify the outliers directly by investigating one variable at a time.
It is the combination of the energy and enthalpy variables that allows us to easily identify the anomalies.
# pairwise scatter plots (+ per-variable histograms) for the training set
g = sns.pairplot(train_df)
g.fig.suptitle("pairwise relationships train set")
Text(0.5, 0.98, 'pairwise relationships train set')
# pairwise scatter plots (+ per-variable histograms) for the test set
g = sns.pairplot(test_df)
g.fig.suptitle("pairwise relationships test set")
Text(0.5, 0.98, 'pairwise relationships test set')
plt.figure()
plt.title('train and test data distribution')
# overlay both sets on the same axes so anomalous test points stand out
sns.scatterplot(x="enthalpy", y="energy", data=train_df);
sns.scatterplot(x="enthalpy", y="energy", data=test_df);
# legend labels follow plotting order: train first, then test
plt.legend(labels=['train data ', 'test data'])
<matplotlib.legend.Legend at 0x2845fc9f2b0>
It becomes quite easy to visually identify abnormal consumption through data points located outside the typical distribution.
We have only two variables and we can clearly visualize the relation between them, so we can use linear regression to fit the data, then calculate the residual value.
By examining the distribution of the residual values, we can choose a cutoff value or threshold.
Every sample whose residual value exceeds the cutoff value is considered excessive consumption.
For this problem, a linear regression model can reach very good results and we don't have to use a more complicated model (an auto-encoder, for example).
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
model = LinearRegression()
#create polynomial features (degree=4 appears chosen empirically — TODO confirm via validation)
poly = PolynomialFeatures(degree=4)
# fit_transform on train only; reuse the fitted transformer on test
poly_train = poly.fit_transform(train_df['enthalpy'].values.reshape(-1, 1))
poly_test = poly.transform(test_df['enthalpy'].values.reshape(-1, 1))
#fit the model: energy as a degree-4 polynomial of enthalpy
model.fit(poly_train, train_df['energy'])
LinearRegression()
#prediction: expected ("optimal") energy for each enthalpy value
train_pred = model.predict(poly_train)
test_pred = model.predict(poly_test)
#train set
df_pred_train = train_df.copy()
df_pred_train['energy_pred'] = train_pred
#get residual values for train set
# residual > 0 means actual consumption exceeds the fitted curve
df_pred_train["Residual"] = df_pred_train["energy"] - df_pred_train["energy_pred"]
df_pred_train.head()
| date | energy | enthalpy | energy_pred | Residual | |
|---|---|---|---|---|---|
| 0 | 2019-06-02 | 6661.954286 | 30488.616385 | 6636.880547 | 25.073738 |
| 1 | 2019-06-03 | 6663.454286 | 30338.174100 | 6634.557824 | 28.896461 |
| 2 | 2019-06-04 | 6653.537143 | 30670.887292 | 6639.784463 | 13.752680 |
| 3 | 2019-06-05 | 6643.868571 | 30807.607844 | 6642.027194 | 1.841377 |
| 4 | 2019-06-06 | 6616.860000 | 30775.883551 | 6641.501871 | -24.641871 |
plt.figure()
# histogram of the training residuals with a KDE overlay
sns.histplot(data=df_pred_train['Residual'],kde=True)
plt.title('residual distribution of training data')
Text(0.5, 1.0, 'residual distribution of training data')
From this figure, we can notice that the residual values follow a Gaussian distribution, so we can use the standard deviation to define the cutoff value.
1 Standard Deviation from the Mean: 68%
2 Standard Deviations from the Mean: 95%
3 Standard Deviations from the Mean: 99.7%
# Standard deviation of the training residuals, used below to set the cutoff.
# The original did `from numpy import std` and then immediately rebound `std`
# to the result, shadowing the imported function. Use the module-level `np`
# alias instead. NB: np.std uses ddof=0 (population std), unlike pandas
# Series.std (ddof=1), so the numeric value is unchanged.
std = np.std(df_pred_train["Residual"])
print('standard deviation = ', std)
standard deviation = 111.84052961905431
# apply the same residual pipeline to the test set
df_pred_test = test_df.copy()
df_pred_test['energy_pred'] = test_pred
#get residual
df_pred_test["Residual"] = df_pred_test["energy"] - df_pred_test["energy_pred"]
plt.figure()
sns.histplot(data=df_pred_test['Residual'],kde=True)
plt.title('residual distribution of testing data')
Text(0.5, 1.0, 'residual distribution of testing data')
#train data
# first candidate threshold: 2 standard deviations of the training residuals
cutoff = 2*std
print('cutoff value = ', cutoff)
plt.figure()
plt.title('consumption efficiency')
sns.scatterplot(x="enthalpy", y="energy", data=df_pred_train, color='grey')
sns.lineplot(x="enthalpy", y="energy_pred", data = df_pred_train, color='blue')
# red line = fitted curve shifted up by the cutoff (the anomaly threshold)
sns.lineplot(x="enthalpy", y="energy_pred", data = pd.concat([df_pred_train['energy_pred']+cutoff,
df_pred_train['enthalpy']],axis=1, join='inner'), color='red')
plt.legend(labels=['optimal_consumption ', 'cutoff value', 'train data'])
cutoff value = 223.68105923810862
<matplotlib.legend.Legend at 0x28464f5e430>
#train data
# tighter threshold: 1.7 standard deviations (retained after visual comparison)
cutoff = 1.7*std
print('cutoff value = ', cutoff)
plt.figure()
plt.title('consumption efficiency')
sns.scatterplot(x="enthalpy", y="energy", data=df_pred_train, color='grey')
sns.lineplot(x="enthalpy", y="energy_pred", data = df_pred_train, color='blue')
# red line = fitted curve shifted up by the cutoff (the anomaly threshold)
sns.lineplot(x="enthalpy", y="energy_pred", data = pd.concat([df_pred_train['energy_pred']+cutoff,
df_pred_train['enthalpy']],axis=1, join='inner'), color='red')
plt.legend(labels=['optimal_consumption ', 'cutoff value', 'train data'])
cutoff value = 190.12890035239232
<matplotlib.legend.Legend at 0x284620a57c0>
After some visualisations using different values of the cutoff, we notice that the value 1.7*std gives better results.
The energy isn't only related to the enthalpy; it is also affected by the number of customers and how much time they
spend, and it can also depend on the sunset time, which differs between summer and winter.
That explains the presence of some outliers in the training set.
#test data
# same visualization for the test set, reusing the 1.7*std cutoff currently in scope
plt.figure()
plt.title('consumption efficiency')
sns.scatterplot(x="enthalpy", y="energy", data=df_pred_test, color='grey')
sns.lineplot(x="enthalpy", y="energy_pred", data = df_pred_test, color='blue')
# red line = fitted curve shifted up by the cutoff (the anomaly threshold)
sns.lineplot(x="enthalpy", y="energy_pred", data = pd.concat([df_pred_test['energy_pred']+cutoff,
df_pred_test['enthalpy']],axis=1, join='inner'), color='red')
plt.legend(labels=['optimal_consumption ', 'cutoff value', 'test data'])
<matplotlib.legend.Legend at 0x284617debe0>
def consumption_efficiency(enthalpy, energy):
    """Classify a single (enthalpy, energy) reading against the fitted curve.

    Predicts the expected ("optimal") energy for `enthalpy` with the global
    polynomial model and prints whether `energy` stays within `cutoff` of it.

    Parameters
    ----------
    enthalpy : float
        Enthalpy value for the day.
    energy : float
        Measured energy consumption for the day.
    """
    enthalpy_poly = poly.transform(np.array(enthalpy).reshape(-1, 1))
    # model.predict returns a length-1 array; extract the scalar so the
    # comparison and the int() conversion below don't operate on an array
    # (implicit array-to-scalar coercion is deprecated in modern numpy).
    energy_pred = model.predict(enthalpy_poly)[0]
    residual = energy - energy_pred
    if residual < cutoff:
        print('optimal consumption')
    else:
        # fixed typo in the user-facing message: "less then" -> "less than"
        print('excessive consumption, a normal consumption should be less than:', int(cutoff + energy_pred))
consumption_efficiency(50000,7700)
excessive consumption, a normal consumption should be less then: 7643
def days_consumption(df):
    """Label each day in `df` as efficient or not, using the global model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date', 'enthalpy' and 'energy' columns.

    Returns
    -------
    tuple of (DataFrame, Series, Series)
        The annotated frame (with 'energy_pred', 'Residual' and 'efficient'
        columns added), the dates with optimal consumption, and the dates
        with excessive consumption.
    """
    # Work on a copy: the previous version added the three columns to the
    # caller's DataFrame as a side effect before copying it for the return.
    out = df.copy()
    poly_df = poly.transform(out['enthalpy'].values.reshape(-1, 1))
    out["energy_pred"] = model.predict(poly_df)
    out["Residual"] = out["energy"] - out["energy_pred"]
    #if the consumption is normal => efficient = True, else efficient = False
    out['efficient'] = out['Residual'] < cutoff
    optimal_days = out.loc[out['efficient'], 'date']
    not_optimal_days = out.loc[~out['efficient'], 'date']
    return (out, optimal_days, not_optimal_days)
#optimal_days: second element of the returned tuple
days_consumption(test_df)[1]
0 2019-02-22
1 2019-02-23
2 2019-02-24
3 2019-02-25
4 2019-02-26
...
118 2020-09-21
127 2020-09-30
133 2020-10-06
134 2020-10-07
400 2021-06-30
Name: date, Length: 115, dtype: object
#not_optimal_days: third element of the returned tuple
days_consumption(test_df)[2]
57 2019-04-20
88 2019-05-21
89 2019-05-22
90 2019-05-23
91 2019-05-24
...
422 2021-07-22
423 2021-07-23
424 2021-07-24
425 2021-07-25
426 2021-07-26
Name: date, Length: 312, dtype: object
By choosing more than one cutoff value, we could define several intervals of consumption (low, optimal, slightly excessive, too excessive).
# Export the annotated test set, indexed and ordered by date.
test_out, _, _ = days_consumption(test_df)
# Set the date index first, then sort on it. The original called
# sort_values(by="date") after set_index('date'), which relies on pandas
# resolving 'date' as an index level; sort_index() is unambiguous and
# produces the same ordering.
test_out = test_out.set_index('date').sort_index()
test_out.to_csv('test_out.csv')